/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package cn.ac.iie.di.tools;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Command-line scratch tool: reads a text file line by line, parses every line as an HTML
 * fragment with Jsoup, and prints the date found in the first {@code <em>} element of that
 * line as epoch seconds.
 *
 * @author Austin
 */
public class HtmlParserTest {

    /** Input file that is read when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "D:\\123\\pa_gmsg_tran\\new1234.txt";

    /** Format of the date text expected inside the {@code <em>} element. */
    private static final String DATE_FORMAT = "yyyy-MM-dd";

    /**
     * Date regex (source: http://www.jb51.net/article/28034.htm).
     *
     * <p>Accepts {@code yyyy-MM-dd} text whose day is within the range allowed for the month;
     * {@code 02-29} is accepted only for years the expression treats as leap years, and the
     * year {@code 0000} is rejected. Compiled once because it is the same for every line.
     */
    private static final Pattern DATE_PATTERN = Pattern.compile("^(?:(?!0000)[0-9]{4}-(?:(?:0[1-9]|1[0-2])-(?:0[1-9]|1[0-9]|2[0-8])|(?:0[13-9]|1[0-2])-(?:29|30)|(?:0[13578]|1[02])-31)|(?:[0-9]{2}(?:0[48]|[2468][048]|[13579][26])|(?:0[48]|[2468][048]|[13579][26])00)-02-29)$");

    /**
     * Reads the input file and, for each line that contains an {@code <em>} element, prints
     * either the element's date as epoch seconds or a format-error message.
     *
     * <p>Lines without an {@code <em>} element produce no output. If a line has several
     * {@code <em>} elements only the first one is considered.
     *
     * @param args optional; {@code args[0]} is the path of the file to read. When absent,
     *             {@link #DEFAULT_INPUT_PATH} is used.
     * @throws IOException if the input file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // Read the whole file into memory, one entry per line (JDK 1.8 API).
        List<String> lines = Files.readAllLines(Paths.get(inputPath));

        // Created once and reused: the formatter is loop-invariant and main is single-threaded.
        // Dates are interpreted in the JVM's default time zone.
        SimpleDateFormat formatter = new SimpleDateFormat(DATE_FORMAT);

        for (String line : lines) {
            Document doc = Jsoup.parse(line);

            // First check whether an <em> tag exists at all.
            Elements emElements = doc.getElementsByTag("em");
            if (emElements.isEmpty()) {
                continue;
            }

            // If there is more than one <em> tag, only the first one is used.
            String em = emElements.get(0).text();

            // Validate the date format before attempting to parse it.
            if (!DATE_PATTERN.matcher(em).find()) {
                // Malformed date: report it and move on to the next line.
                System.out.println("格式错误！" + em);
                continue;
            }

            try {
                // getTime() is in milliseconds; convert to whole seconds.
                long emTime = formatter.parse(em).getTime() / 1000;
                System.out.println(emTime);
            } catch (ParseException e) {
                // Previously swallowed silently; surface the failure without aborting the run.
                System.err.println("解析失败！" + em + " (" + e.getMessage() + ")");
            }
        }
    }
}
